#Analysis of the Sustainable Communities metrics data.
First, we read in the raw data. In this case, we start with just the original raw values. This is a simplified version of the XLSX file that only includes those columns. We will produce z-scores here. Note that the data live in the “data” subdirectory.
# readr for fast CSV parsing; here for project-root-relative file paths.
library(readr)
library(here)
# Raw 2015 Sustainable Communities metrics. read_csv returns a tibble
# (886 rows x 22 columns per the output below).
CommunityData <- read_csv(here("data/CommunityData-raw-2015-v3.csv"))
Rows: 886 Columns: 22── Column specification ────────────────────────────────────────
Delimiter: ","
chr (2): ME, MELSAD
dbl (20): GEOID, AQI_Good, Bachelor_Over_25, Per_Poverty, Gi...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(CommunityData)
#set the row names in the table
# NOTE(review): setting row names on a tibble is deprecated (read_csv
# returns a tibble; see the warning this call used to emit) and the
# result was never used -- the labels are attached to the plain
# data.frame `mydata` below instead. The call is removed.
# rownames(CommunityData) <- CommunityData$ME
Warning: Setting row names on a tibble is deprecated.
#summary(CommunityData)
Having read in the data, now set rownames so we can later do a cool plot of the values with labels. Note that at this point it makes sense to omit the data with missing values (NA). We can do fancier stuff later to see what happens if we estimate those.
# Keep only the metric columns; columns 1-3 are identifiers
# (GEOID, ME, MELSAD per the column spec above).
# as.data.frame() converts the tibble directly -- the former
# unclass() step was redundant -- and a plain data.frame accepts
# row names without a deprecation warning.
mydata <- as.data.frame(CommunityData[, 4:ncol(CommunityData)])
#View(mydata)
# Label rows by community name so later cluster plots can show labels.
rownames(mydata) <- CommunityData$ME
#summary(mydata)
#dim(mydata)
# We can now remove any records that have NAs
myDataClean <- na.omit(mydata)
#dim(myDataClean)
#summary(myDataClean)
Now, let’s scale the data so that the variance in the columns is comparable (assuming we want to treat each dimension as equivalent). Here, we change the raw scores to z-scores.
## z-scores
# Standardize every column to mean 0 / sd 1 so the dimensions are
# comparable; center = TRUE and scale = TRUE are scale()'s defaults.
scaled_data <- as.matrix(scale(myDataClean))
#View(scaled_data)
Now, we will do a kmeans analysis and use the elbow method to see if there are optimal clusters. Note we are starting with all the original data.
# k-means on the full z-scored data with 11 centers; nstart = 50 random
# restarts guards against poor local optima. (`<-`, not `=`, for
# assignment per R style.)
kmm <- kmeans(scaled_data, centers = 11, nstart = 50, iter.max = 15)
#kmm
The k-means analysis will evaluate 1 to 20 clusters.
#Elbow Method for finding the optimal number of clusters
set.seed(123)
# Compute wss for k = 1 to k = 20. (The previous comment said 2 to 20,
# but the loop has always started at 1 -- the printed wss has 20 values.)
k.max <- 20
# Use a distinct name instead of `data`, which shadows the base R
# function of the same name; vapply() (not sapply()) guarantees a
# numeric vector result.
elbow_input <- scaled_data
wss <- vapply(seq_len(k.max),
              function(k) kmeans(elbow_input, k, nstart = 50, iter.max = 15)$tot.withinss,
              numeric(1))
wss
[1] 15732.000 13704.329 12674.635 12030.804 11438.735 10966.816
[7] 10558.709 10188.264 9896.572 9614.815 9359.013 9129.764
[13] 8951.403 8800.253 8649.838 8505.266 8374.320 8271.117
[19] 8145.538 8049.695
# Scree plot: total within-cluster sum of squares for each k,
# points joined by lines so the "elbow" (if any) is visible.
plot(seq_len(k.max), wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
Now we have a scree plot showing the number of clusters and the sum of squares. Hmmm. It is not clear if there is a clear number of clusters. We can try using BIC to see if there is an optimal group of clusters. This uses the mclust package.
library(mclust)
# Fit Gaussian mixture models with G = 1..15 components across every
# covariance parameterization in mclust.options("emModelNames");
# Mclust selects the best (model, G) combination by BIC.
d_clust <- Mclust(as.matrix(scaled_data), G = 1:15,
                  modelNames = mclust.options("emModelNames"))
fitting ...
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 10%
|
|======== | 11%
|
|========= | 12%
|
|========= | 13%
|
|========== | 13%
|
|========== | 14%
|
|=========== | 14%
|
|=========== | 15%
|
|============ | 16%
|
|============ | 17%
|
|============= | 17%
|
|============= | 18%
|
|============== | 18%
|
|============== | 19%
|
|=============== | 20%
|
|=============== | 21%
|
|================ | 21%
|
|================ | 22%
|
|================= | 23%
|
|================== | 24%
|
|================== | 25%
|
|=================== | 25%
|
|=================== | 26%
|
|==================== | 27%
|
|===================== | 28%
|
|===================== | 29%
|
|====================== | 29%
|
|====================== | 30%
|
|======================= | 31%
|
|======================= | 32%
|
|======================== | 32%
|
|======================== | 33%
|
|========================= | 33%
|
|========================= | 34%
|
|========================== | 35%
|
|========================== | 36%
|
|=========================== | 36%
|
|=========================== | 37%
|
|============================ | 37%
|
|============================ | 38%
|
|============================= | 39%
|
|============================= | 40%
|
|============================== | 40%
|
|============================== | 41%
|
|=============================== | 41%
|
|=============================== | 42%
|
|================================ | 43%
|
|================================ | 44%
|
|================================= | 44%
|
|================================= | 45%
|
|================================== | 45%
|
|================================== | 46%
|
|=================================== | 47%
|
|=================================== | 48%
|
|==================================== | 48%
|
|==================================== | 49%
|
|===================================== | 50%
|
|====================================== | 51%
|
|====================================== | 52%
|
|======================================= | 52%
|
|======================================= | 53%
|
|======================================== | 54%
|
|======================================== | 55%
|
|========================================= | 55%
|
|========================================= | 56%
|
|========================================== | 56%
|
|========================================== | 57%
|
|=========================================== | 58%
|
|=========================================== | 59%
|
|============================================ | 59%
|
|============================================ | 60%
|
|============================================= | 60%
|
|============================================= | 61%
|
|============================================== | 62%
|
|============================================== | 63%
|
|=============================================== | 63%
|
|=============================================== | 64%
|
|================================================ | 64%
|
|================================================ | 65%
|
|================================================= | 66%
|
|================================================= | 67%
|
|================================================== | 67%
|
|================================================== | 68%
|
|=================================================== | 68%
|
|=================================================== | 69%
|
|==================================================== | 70%
|
|==================================================== | 71%
|
|===================================================== | 71%
|
|===================================================== | 72%
|
|====================================================== | 73%
|
|======================================================= | 74%
|
|======================================================= | 75%
|
|======================================================== | 75%
|
|======================================================== | 76%
|
|========================================================= | 77%
|
|========================================================== | 78%
|
|========================================================== | 79%
|
|=========================================================== | 79%
|
|=========================================================== | 80%
|
|============================================================ | 81%
|
|============================================================ | 82%
|
|============================================================= | 82%
|
|============================================================= | 83%
|
|============================================================== | 83%
|
|============================================================== | 84%
|
|=============================================================== | 85%
|
|=============================================================== | 86%
|
|================================================================ | 86%
|
|================================================================ | 87%
|
|================================================================= | 87%
|
|================================================================= | 88%
|
|================================================================== | 89%
|
|================================================================== | 90%
|
|=================================================================== | 90%
|
|=================================================================== | 91%
|
|==================================================================== | 91%
|
|==================================================================== | 92%
|
|===================================================================== | 93%
|
|===================================================================== | 94%
|
|====================================================================== | 94%
|
|====================================================================== | 95%
|
|======================================================================= | 96%
|
|======================================================================== | 97%
|
|======================================================================== | 98%
|
|========================================================================= | 98%
|
|========================================================================= | 99%
|
|==========================================================================| 100%
# BIC for every (model, G) combination; less negative is better.
d_clust$BIC
Bayesian Information Criterion (BIC):
EII VII EEI VEI EVI VVI EEE VEE
1 -44814.79 -44814.79 -44935.76 -44935.76 -44935.76 -44935.76 -41192.76 -41192.76
2 -43750.43 -43198.86 -43651.43 -43154.18 -42572.59 -42473.36 -40739.34 -39886.15
3 -43340.52 -42614.63 -42934.43 -42359.22 -41279.08 -41129.33 -40151.65 -39275.42
4 -42975.02 -41938.78 -42489.55 -41559.65 -40490.89 -40327.39 -40054.84 -39090.87
5 -42632.37 -41429.81 -41981.83 -41054.40 -40009.25 -39793.63 -39882.34 -39054.89
6 -42377.62 -41323.41 -41699.24 -40780.54 -39744.32 -39419.16 -39732.79 -38820.25
7 -42271.40 -41099.12 -41562.45 -40542.73 -39503.40 -39385.52 -39678.73 -38770.86
8 -42000.88 -40887.14 -41470.35 -40355.22 -39453.54 -38965.87 -39609.70 -38774.17
9 -41945.35 -40851.71 -41179.84 -40378.06 -39313.62 -38894.03 -39546.17 -38767.72
10 -41640.57 -40672.28 -40812.40 -40143.78 -39231.30 -38922.55 -39570.95 -38576.87
11 -41601.72 -40677.40 -40849.19 -40026.87 -39105.81 -38751.32 -39572.44 -38570.91
12 -41550.54 -40447.32 -40847.95 -39836.86 -39144.78 -38761.11 -39532.85 NA
13 -41533.94 -40375.67 -40835.70 -39795.45 -39136.30 -38746.79 -39433.62 NA
14 -41545.25 -40258.69 -40876.01 -39768.63 -39242.05 -38727.50 -39496.26 NA
15 -41241.87 -40262.74 -40657.59 -39795.09 -39218.14 -38546.76 -39385.06 NA
EVE VVE EEV VEV EVV VVV
1 -41192.76 -41192.76 -41192.76 -41192.76 -41192.76 -41192.76
2 -39705.80 -39463.24 -40009.46 -39578.94 -40004.41 -39616.32
3 -38872.30 -38529.87 -39619.66 -39112.28 -39673.52 -39162.07
4 -38758.59 -38364.45 -40173.96 -39650.67 -40266.20 -39901.62
5 -38329.17 -37987.66 -40344.01 -39915.91 -40532.81 -40325.94
6 -38225.86 -37896.24 -40957.88 -40705.72 -41608.05 -41182.83
7 -38219.95 -37911.48 -41757.41 -41519.34 -42508.08 -42084.23
8 -38111.10 -37880.79 -42818.65 -42558.72 -43471.40 -43149.67
9 -38145.55 -37801.72 -43568.02 -43357.91 -44470.24 -44030.01
10 -38047.23 -37673.96 -44458.52 -44116.97 -45324.40 -44994.27
11 -38278.84 -37622.80 -45367.88 -45076.84 -46533.32 -45909.54
12 NA NA -46409.43 -45766.29 NA NA
13 NA NA -47502.25 -46675.55 NA NA
14 NA NA -48317.21 -47526.60 NA NA
15 NA NA -49003.17 -48327.91 NA NA
Top 3 models based on the BIC criterion:
VVE,11 VVE,10 VVE,9
-37622.80 -37673.96 -37801.72
# Interactive: prompts for which diagnostic to draw (1 = BIC, 0 = exit).
plot(d_clust)
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
1
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
0
We can also go a different route and use NbClust to see if we can figure out how many clusters there might be.
library(NbClust)
# Run the full battery of NbClust indices on the z-scored data, using
# k-means partitions over candidate cluster counts 2 through 13.
nb <- NbClust(scaled_data, diss = NULL, distance = "euclidean",
              min.nc = 2, max.nc = 13, method = "kmeans",
              index = "all", alphaBeale = 0.1)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 6 proposed 2 as the best number of clusters
* 7 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 2 proposed 6 as the best number of clusters
* 2 proposed 7 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 10 as the best number of clusters
* 2 proposed 12 as the best number of clusters
* 1 proposed 13 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
# Histogram of the cluster counts proposed by the individual indices.
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))
Let’s make some plots of the clusters. We will pick 11 as the number of clusters to examine based on the analysis above
# k-means with 11 clusters on the full z-scored data, for visualization.
res.km <- kmeans(scaled_data, centers = 11, nstart = 25)
library(factoextra)
## repel is on
fviz_cluster(res.km, data = scaled_data,
             geom = "text",
             ellipse.type = "convex",
             ggtheme = theme_bw(),
             labelsize = 3, repel = TRUE)
# no repel
fviz_cluster(res.km, data = scaled_data,
             geom = "text",
             ellipse.type = "convex",
             ggtheme = theme_bw(),
             labelsize = 3, repel = FALSE)
Okay. Until now we have been using the raw data and all of the columns (though converted to z-scores). To see if we get any clarity, we can reduce the dimensionality by doing a PCA and then choosing a smaller number of dimensions to capture the variability.
This next code does the PCA.
# Proceed with principal components
# NOTE(review): princomp() uses the eigendecomposition of the covariance
# matrix (divisor n); prcomp() is generally preferred numerically (SVD).
# Kept as-is to preserve the reported component scores.
pc <- princomp(scaled_data)
# Scree plots: variance per component, as bars then as a line.
plot(pc)
plot(pc, type='l')
Hmmm… It seems like 5 components explain a majority of the variance, so let’s start with 5.
# Keep the scores on the first five principal components (chosen from
# the scree plot above). `<-` for assignment; the c() wrapper around
# 1:5 was redundant.
pc_data <- pc$scores[, 1:5]
Now let’s do a cluster analysis based on these data. We will first use a k-means.
The k-means model is “almost” a Gaussian mixture model and one can construct a likelihood for the Gaussian mixture model and thus also determine information criterion values. We install the mclust package and we will use the Mclust method of it. Determine the optimal model and number of clusters according to the Bayesian Information Criterion for expectation-maximization, initialized by hierarchical clustering for parameterized Gaussian mixture models. In this method we had set the modelNames parameter to mclust.options(“emModelNames”) so that it includes only those models for evaluation where the number of observation is greater than the dimensions of the dataset.
# k-means on the 5-component PCA scores with 3 centers and 50 restarts.
kmm <- kmeans(pc_data, centers = 3, nstart = 50, iter.max = 15)
#kmm
#Elbow Method for finding the optimal number of clusters
set.seed(123)
# Compute wss for k = 1 to k = 20 (the loop starts at 1, not 2 as the
# old comment claimed -- 20 values are printed below).
k.max <- 20
# Distinct name: `data` shadows the base R function data(); vapply()
# guarantees a numeric vector, unlike sapply().
elbow_pc_input <- pc_data
wss <- vapply(seq_len(k.max),
              function(k) kmeans(elbow_pc_input, k, nstart = 50, iter.max = 15)$tot.withinss,
              numeric(1))
wss
[1] 8814.175 6792.828 5784.109 5188.114 4700.984 4332.250 4036.509 3806.278
[9] 3608.112 3421.527 3286.151 3149.705 3034.848 2938.883 2833.620 2747.831
[17] 2658.917 2590.750 2520.527 2462.946
# Scree plot of within-cluster sum of squares for the PCA-score fits.
plot(seq_len(k.max), wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
Therefore, for k > 5 the between_ss/total_ss ratio tends to change slowly, less than it does for smaller k. So how do we decide what the optimal choice will be? We look at a second approach, which comes with a new package.
We can now use BIC to also evaluate the number of clusters
library(mclust)
# Repeat the BIC-based mixture-model search, now on the 5-D PCA scores.
d_clust <- Mclust(as.matrix(pc_data), G = 1:15,
                  modelNames = mclust.options("emModelNames"))
fitting ...
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 10%
|
|======== | 11%
|
|========= | 12%
|
|========= | 13%
|
|========== | 13%
|
|========== | 14%
|
|=========== | 14%
|
|=========== | 15%
|
|============ | 16%
|
|============ | 17%
|
|============= | 17%
|
|============= | 18%
|
|============== | 18%
|
|============== | 19%
|
|=============== | 20%
|
|=============== | 21%
|
|================ | 21%
|
|================ | 22%
|
|================= | 23%
|
|================== | 24%
|
|================== | 25%
|
|=================== | 25%
|
|=================== | 26%
|
|==================== | 27%
|
|===================== | 28%
|
|===================== | 29%
|
|====================== | 29%
|
|====================== | 30%
|
|======================= | 31%
|
|======================= | 32%
|
|======================== | 32%
|
|======================== | 33%
|
|========================= | 33%
|
|========================= | 34%
|
|========================== | 35%
|
|========================== | 36%
|
|=========================== | 36%
|
|=========================== | 37%
|
|============================ | 37%
|
|============================ | 38%
|
|============================= | 39%
|
|============================= | 40%
|
|============================== | 40%
|
|============================== | 41%
|
|=============================== | 41%
|
|=============================== | 42%
|
|================================ | 43%
|
|================================ | 44%
|
|================================= | 44%
|
|================================= | 45%
|
|================================== | 45%
|
|================================== | 46%
|
|=================================== | 47%
|
|=================================== | 48%
|
|==================================== | 48%
|
|==================================== | 49%
|
|===================================== | 50%
|
|====================================== | 51%
|
|====================================== | 52%
|
|======================================= | 52%
|
|======================================= | 53%
|
|======================================== | 54%
|
|======================================== | 55%
|
|========================================= | 55%
|
|========================================= | 56%
|
|========================================== | 56%
|
|========================================== | 57%
|
|=========================================== | 58%
|
|=========================================== | 59%
|
|============================================ | 59%
|
|============================================ | 60%
|
|============================================= | 60%
|
|============================================= | 61%
|
|============================================== | 62%
|
|============================================== | 63%
|
|=============================================== | 63%
|
|=============================================== | 64%
|
|================================================ | 64%
|
|================================================ | 65%
|
|================================================= | 66%
|
|================================================= | 67%
|
|================================================== | 67%
|
|================================================== | 68%
|
|=================================================== | 68%
|
|=================================================== | 69%
|
|==================================================== | 70%
|
|==================================================== | 71%
|
|===================================================== | 71%
|
|===================================================== | 72%
|
|====================================================== | 73%
|
|======================================================= | 74%
|
|======================================================= | 75%
|
|======================================================== | 75%
|
|======================================================== | 76%
|
|========================================================= | 77%
|
|========================================================== | 78%
|
|========================================================== | 79%
|
|=========================================================== | 79%
|
|=========================================================== | 80%
|
|============================================================ | 81%
|
|============================================================ | 82%
|
|============================================================= | 82%
|
|============================================================= | 83%
|
|============================================================== | 83%
|
|============================================================== | 84%
|
|=============================================================== | 85%
|
|=============================================================== | 86%
|
|================================================================ | 86%
|
|================================================================ | 87%
|
|================================================================= | 87%
|
|================================================================= | 88%
|
|================================================================== | 89%
|
|================================================================== | 90%
|
|=================================================================== | 90%
|
|=================================================================== | 91%
|
|==================================================================== | 91%
|
|==================================================================== | 92%
|
|===================================================================== | 93%
|
|===================================================================== | 94%
|
|====================================================================== | 94%
|
|====================================================================== | 95%
|
|======================================================================= | 96%
|
|======================================================================== | 97%
|
|======================================================================== | 98%
|
|========================================================================= | 98%
|
|========================================================================= | 99%
|
|==========================================================================| 100%
# BIC table for the PCA-score fits; the top models are listed below it.
d_clust$BIC
Bayesian Information Criterion (BIC):
EII VII EEI VEI EVI VVI EEE VEE
1 -14930.55 -14930.55 -14565.10 -14565.10 -14565.10 -14565.10 -14632.30 -14632.30
2 -14696.62 -14659.97 -14514.13 -14405.18 -14548.86 -14403.45 -14532.31 -14460.83
3 -14621.28 -14599.03 -14483.68 -14390.42 -14537.62 -14408.53 -14473.24 -14433.99
4 -14582.89 -14461.84 -14530.75 -14394.00 -14482.88 -14403.88 -14423.14 -14403.82
5 -14546.53 -14393.55 -14519.02 -14299.40 -14441.73 -14335.99 -14503.89 -14311.14
6 -14476.28 -14391.12 -14369.86 -14296.22 -14455.78 -14336.22 -14334.87 -14267.74
7 -14432.68 -14406.01 -14378.82 -14249.96 -14421.16 -14336.10 -14338.32 -14222.17
8 -14416.58 -14328.49 -14365.52 -14230.54 -14433.29 -14339.69 -14346.14 -14219.56
9 -14413.93 -14349.17 -14284.42 -14235.36 -14406.33 -14386.67 -14348.43 -14228.86
10 -14374.11 -14312.96 -14286.48 -14207.18 -14433.70 -14425.52 -14296.94 -14255.06
11 -14383.99 -14309.77 -14278.91 -14217.30 -14447.77 -14464.15 -14311.05 -14280.97
12 -14393.85 -14334.79 -14295.64 -14239.28 -14495.42 -14458.40 -14325.22 -14255.78
13 -14392.09 -14340.97 -14332.92 -14278.42 -14536.06 -14476.87 -14361.83 -14268.28
14 -14425.80 -14359.00 -14371.10 -14313.21 -14572.79 -14510.84 -14401.28 -14304.67
15 -14390.35 -14367.18 -14389.17 -14356.93 -14551.41 -14573.53 -14402.64 -14327.15
EVE VVE EEV VEV EVV VVV
1 -14632.30 -14632.30 -14632.30 -14632.30 -14632.30 -14632.30
2 -14513.03 -14418.97 -14512.42 -14384.74 -14530.46 -14404.23
3 -14464.59 -14379.55 -14385.90 -14282.41 -14425.50 -14330.86
4 -14397.09 -14312.38 -14364.82 -14235.15 -14398.72 -14365.24
5 -14415.65 -14284.71 -14379.90 -14261.22 -14422.81 -14380.21
6 -14351.94 -14343.22 -14382.70 -14352.67 -14459.86 -14440.93
7 -14335.59 -14275.90 -14456.90 -14356.32 -14519.71 -14458.71
8 -14354.71 -14300.98 -14444.91 -14407.57 -14536.01 -14558.59
9 -14401.83 -14335.13 -14496.79 -14477.47 -14599.98 -14609.81
10 -14355.84 -14371.89 -14536.97 -14512.67 -14681.76 -14708.92
11 -14390.92 -14420.04 -14565.83 -14587.49 -14761.95 -14784.28
12 -14451.18 -14445.89 -14618.05 -14636.73 -14847.05 -14874.41
13 -14487.10 -14472.06 -14689.40 -14739.21 -14894.64 -14985.16
14 -14530.20 -14518.74 -14708.94 -14848.38 -15052.34 -15025.46
15 -14585.88 -14547.36 -14798.71 -14836.02 -15122.02 -15164.91
Top 3 models based on the BIC criterion:
VEI,10 VEI,11 VEE,8
-14207.18 -14217.30 -14219.56
# Interactive: prompts for which diagnostic to draw (1 = BIC, 0 = exit).
plot(d_clust)
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
1
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
0
Based on these analyses, the number of clusters is either 10 or 11 (both with the VEI model) - they are close to each other.
NbClust package provides 30 indices for determining the number of clusters and proposes to user the best clustering scheme from the different results obtained by varying all combinations of number of clusters, distance measures, and clustering methods. Let’s try that.
library(NbClust)
# All NbClust indices on the PCA scores, k-means partitions,
# candidate cluster counts 2 through 15.
nb <- NbClust(pc_data, diss = NULL, distance = "euclidean",
              min.nc = 2, max.nc = 15, method = "kmeans",
              index = "all", alphaBeale = 0.1)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 8 proposed 2 as the best number of clusters
* 6 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 1 proposed 5 as the best number of clusters
* 1 proposed 6 as the best number of clusters
* 1 proposed 7 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 4 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
# Histogram of the cluster counts proposed by the individual indices.
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))
Now some nice plots. Let’s use 11 as the right number of clusters.
library(cluster)
library(fpc)
# The narrative above settles on 11 clusters, but the original call used
# centers = 1 (a single cluster), which renders both plots meaningless.
# Fixed to match the stated choice.
clus <- kmeans(pc_data, centers = 11)
#clus
# Discriminant-projection plot colored by cluster assignment.
plotcluster(pc_data, clus$cluster)
# Bivariate cluster plot with shaded ellipses; labels = 4 labels
# only the ellipses.
clusplot(pc_data, clus$cluster, color = TRUE, shade = TRUE,
         labels = 4, lines = 0)
Some additional plots
# k-means with 7 clusters on the PCA scores for the factoextra plots.
# NOTE(review): the preceding narrative mentions 11 clusters; 7 is kept
# here to preserve the original figures -- confirm which count is meant.
res.km <- kmeans(pc_data, centers = 7, nstart = 25)
library(factoextra)
## repel is on
fviz_cluster(res.km, data = pc_data,
             geom = "text",
             ellipse.type = "convex",
             ggtheme = theme_bw(),
             labelsize = 3, repel = TRUE)
# no repel
fviz_cluster(res.km, data = pc_data,
             geom = "text",
             ellipse.type = "convex",
             ggtheme = theme_bw(),
             labelsize = 3, repel = FALSE)
Okay. Let’s try TwoStep cluster analysis - using prcr
library(prcr)
# Cluster profiles on the five PCA components (7 profiles); column
# names are passed unquoted per prcr's tidy-eval interface.
# NOTE(review): to_scale = TRUE re-scales inputs that are already
# z-scores / PC scores -- presumably harmless, but confirm intent.
clust<-create_profiles_cluster(pc_data, Comp.1,Comp.2, Comp.3, Comp.4, Comp.5, n_profiles = 7, to_scale = TRUE)
plot_profiles(clust, to_center = TRUE)
# Same profiling on six of the raw z-scored metrics for comparison.
clust<-create_profiles_cluster(scaled_data, AQI_Good,Bachelor_Over_25,Per_Poverty,Gini,non_migration,Per_Sev_Hous, n_profiles = 7, to_scale = TRUE)
plot_profiles(clust, to_center = TRUE)